import sentencepiece as spm


def train(input_file, vocab_size, model_name, model_type, character_coverage):
    """Train a SentencePiece model from a raw-text corpus.

    Writes ``<model_name>.model`` and ``<model_name>.vocab`` to the
    current working directory (SentencePiece's default behavior).

    Args:
        input_file: Path to the plain-text training corpus (one sentence per line).
        vocab_size: Target vocabulary size for the trained model.
        model_name: Output file prefix (``--model_prefix``).
        model_type: SentencePiece algorithm, e.g. ``'bpe'`` or ``'unigram'``.
        character_coverage: Fraction of characters covered by the model
            (0.9995 is the SentencePiece recommendation for Japanese/Chinese).
    """
    # Special-token ids are pinned so downstream code can rely on
    # pad=0, unk=1, bos=2, eos=3 regardless of corpus content.
    cmd = (
        f'--input={input_file} '
        f'--model_prefix={model_name} '
        f'--vocab_size={vocab_size} '
        f'--model_type={model_type} '
        f'--character_coverage={character_coverage} '
        '--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3'
    )
    spm.SentencePieceTrainer.Train(cmd)


def run():
    """Train BPE tokenizers for the Japanese and Chinese corpora.

    Both models share the same hyperparameters; only the corpus path and
    output prefix differ, so the two training runs are driven by a table
    instead of duplicated blocks.
    """
    vocab_size = 32000
    model_type = 'bpe'
    # 0.9995 character coverage is the SentencePiece recommendation for
    # Japanese (original comment: 日本語は0.9995推奨); reused for Chinese.
    character_coverage = 0.9995

    # (corpus path, output model prefix)
    corpora = [
        ('../data/corpus.jp', 'jap'),
        ('../data/corpus.ch', 'chn'),
    ]
    for input_file, model_name in corpora:
        train(input_file, vocab_size, model_name, model_type, character_coverage)


# Script entry point: train both tokenizer models when run directly
# (not on import).
if __name__ == "__main__":
    run()

